LICENSE
Guided_Differentiation_Code Non-commercial License
© 2023 The University of Chicago.

Redistribution and use for noncommercial purposes in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

  1. The software is used solely for noncommercial purposes. It may not be used indirectly for commercial use, such as on a website that accepts advertising money for content. Noncommercial use does include use by a for-profit company in its research. For commercial use rights, contact The University of Chicago, Polsky Center for Entrepreneurship, and Innovation, at or call 773-702-1692 and inquire about Tech ID 22-T-080 project.

  2. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

  3. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

  4. Neither the name of The University of Chicago nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF CHICAGO AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF CHICAGO OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

FIGURE 1 CELL CLUSTERING, ANALYSIS, AND ANNOTATION

# Read 10x data
guided_raw_data <- Read10X("guided_differentiation_2021/220510_A00639_1151_BH3YKKDRX2-YG-EM-17S-lns12/YG-EM-ln1-EM314_human/outs/filtered_feature_bc_matrix/")

# Create Seurat object from raw data
guided_obj <- CreateSeuratObject(guided_raw_data, min.cells = 1)

# Add Vireo information (demultiplex to identify individuals)
add_vireo <- function(guided_obj, dir) {
  donor_ids <- read.table(paste0(dir, 'vireo/donor_ids.tsv'), 
                          header = TRUE, stringsAsFactors = FALSE)
  
  guided_obj <- AddMetaData(guided_obj, donor_ids$donor_id,   
                            col.name ='vireo.individual')
  guided_obj <- AddMetaData(guided_obj, donor_ids$prob_max,   
                            col.name ='vireo.prob.singlet')
  
  ambient.file <- paste0(dir, 'vireo/prop_ambient.tsv')
  if (file.exists(ambient.file)) {
    prop <- read.table(paste0(dir, 'vireo/prop_ambient.tsv'), 
                       header = TRUE, stringsAsFactors = FALSE)
    
    inds <- unique(donor_ids$best_singlet)
    donor_ids$prop_donor <- 0
    for (i in 1:length(inds)) {
      w <- which(donor_ids$best_singlet==inds[i])
      donor_ids$prop_donor[w] <- prop[w,inds[i]]
    }
    guided_obj <- AddMetaData(guided_obj, donor_ids$prop_donor, 
                              col.name ='vireo.prop.donor')
  }
  guided_obj
}

guided_obj <- add_vireo(guided_obj, dir)

individual.colname <- ('vireo.individual')
individual.colname <- individual.colname[which(individual.colname %in% 
                                                 colnames(guided_obj@meta.data))][1]
# Subset cells for each category
NA19114_obj <- subset(guided_obj, subset = vireo.individual == "NA19114")
NA19130_obj <- subset(guided_obj, subset = vireo.individual == "NA19130")
NA19152_obj <- subset(guided_obj, subset = vireo.individual == "NA19152")
doublet_obj <- subset(guided_obj, subset = vireo.individual == "doublet")
unassigned_obj <- subset(guided_obj, subset = vireo.individual == "unassigned")

# Function to print object information
print_info <- function(obj, name) {
  cat("\n", name, ":\n")
  print(obj)
}

# Print information for each object
print_info(NA19114_obj, "NA19114")
## 
##  NA19114 :
## An object of class Seurat 
## 30814 features across 2988 samples within 1 assay 
## Active assay: RNA (30814 features, 0 variable features)
print_info(NA19130_obj, "NA19130")
## 
##  NA19130 :
## An object of class Seurat 
## 30814 features across 3264 samples within 1 assay 
## Active assay: RNA (30814 features, 0 variable features)
print_info(NA19152_obj, "NA19152")
## 
##  NA19152 :
## An object of class Seurat 
## 30814 features across 3206 samples within 1 assay 
## Active assay: RNA (30814 features, 0 variable features)
print_info(doublet_obj, "doublet")
## 
##  doublet :
## An object of class Seurat 
## 30814 features across 710 samples within 1 assay 
## Active assay: RNA (30814 features, 0 variable features)
print_info(unassigned_obj, "unassigned")
## 
##  unassigned :
## An object of class Seurat 
## 30814 features across 50 samples within 1 assay 
## Active assay: RNA (30814 features, 0 variable features)
print_info(guided_obj, "total")
## 
##  total :
## An object of class Seurat 
## 30814 features across 10218 samples within 1 assay 
## Active assay: RNA (30814 features, 0 variable features)
# Filter out doublets, unassigned cells, and cells with fewer than 1500 features
guided_obj <- subset(guided_obj, subset = vireo.individual != "doublet")
guided_obj <- subset(guided_obj, subset = vireo.individual != "unassigned")
guided_obj <- subset(guided_obj, subset = nFeature_RNA >= 1500)

# Print total cells and features after filtering
print(guided_obj)
## An object of class Seurat 
## 30814 features across 8936 samples within 1 assay 
## Active assay: RNA (30814 features, 0 variable features)
# Apply SCTransform, PCA, UMAP, and FindNeighbors
guided_obj_sct <- SCTransform(guided_obj, 
                              variable.features.n = 5000, 
                              verbose = FALSE)
guided_obj_sct <- RunPCA(guided_obj_sct, npcs = 50, verbose = FALSE)
guided_obj_sct <- RunUMAP(guided_obj_sct, dims = 1:50, verbose = FALSE)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
# Break
guided_obj_sct <- FindNeighbors(guided_obj_sct, dims = 1:50, verbose = FALSE)
# UMAP color palette
umap_colors <- c("#117733", # green 
                 "#AA4499", # purple
                 "#CC6677", # rose
                 "#999933", # olive
                 "#4477AA", # blue
                 "#332288", # indigo
                 "#C45F00", # orange
                 "#882255", # wine
                 "#44AA99", # teal
                 "#DDCC77") # sand

# Bar plot color palette
bar_colors <- c("#C45F00", # orange
                "#882255", # wine
                "#DDCC77", # sand
                "#44AA99", # teal 
                "#332288", # indigo
                "#AA4499", # purple
                "#117733", # green
                "#4477AA", # blue
                "#999933", # olive                   
                "#CC6677") # rose
# Set cluster resolution and reorder clusters
guided_obj_sct <- FindClusters(guided_obj_sct, resolution = 0.3, verbose = FALSE)
my_levels <- c('0', '3', '2', '1', '6', '7', '8', '4', '5', '9')
guided_obj_sct@active.ident <- factor(guided_obj_sct@active.ident, 
                                      levels = my_levels)

# Name clusters by cell types
new_cluster_ids <- c("Cardiomyocytes", 
                     "Second heart field", 
                     "Fibroblasts", 
                     "Epicardial cells", 
                     "First heart field", 
                     "Endocardial cells", 
                     "Hepatic endoderm", 
                     "Foregut endoderm", 
                     "Neuroectoderm", 
                     "Pluripotent stem cells")
names(new_cluster_ids) <- levels(guided_obj_sct)
guided_obj_sct <- RenameIdents(guided_obj_sct, new_cluster_ids)
guided_obj_sct$cell_type <- Idents(guided_obj_sct)

# UMAP plot
guided_obj_sct_umap <- DimPlot(guided_obj_sct, reduction = "umap", 
                               cols = umap_colors) + NoLegend() +
  theme(axis.title = element_text(size = 8), axis.ticks = element_blank(), 
        axis.text = element_blank(), panel.background = element_blank()) +
  xlab("UMAP 1") + ylab("UMAP 2")

# Bar plot for cell type percentages
totals <- as.data.frame(table(guided_obj_sct@active.ident)) %>%
  mutate(Percent = scales::label_percent(accuracy = .1)(Freq / sum(Freq))) %>%
  rename(Type = Var1, Count = Freq) %>%
  mutate(Type = factor(Type, levels = c("Hepatic endoderm", 
                                        "Foregut endoderm", 
                                        "Pluripotent stem cells", 
                                        "Neuroectoderm", 
                                        "Endocardial cells", 
                                        "Second heart field", 
                                        "Cardiomyocytes", 
                                        "First heart field", 
                                        "Epicardial cells", 
                                        "Fibroblasts")))

bar <- ggplot(totals, aes(x = Type, y = Count, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip(clip = "off") +
  geom_text(aes(label = Percent), size = 2, hjust = -0.01) +
  scale_fill_manual(values = bar_colors) +
  NoLegend() +
  theme(axis.title.y = element_blank(), axis.ticks.y = element_blank(), 
        axis.text.y = element_text(size = 9), 
        axis.text.x = element_text(size = 8), 
        panel.background = element_blank())

# Plot layout
umap_layout <- c(area(1, 1), area(1, 2))
p <- guided_obj_sct_umap + bar + plot_layout(design = umap_layout)

# Print plot
print(p)

# Marker gene expression plot by cluster
features <- c("POU5F1", 
              "L1TD1", 
              "SOX2", 
              "CRABP1", 
              "HHEX", 
              "FOXA2", 
              "AFP", 
              "KDR", 
              "CDH5", 
              "PECAM1", 
              "NFATC1", 
              "TBX5", 
              "TBX20", 
              "WT1", 
              "TBX18", 
              "BMP4", 
              "COL3A1", 
              "LUM", 
              "POSTN", 
              "EYA1", 
              "ISL1", 
              "FGF10",
              "TNNT2", 
              "ACTN2", 
              "RYR2", 
              "CACNA1C", 
              "TNNI3")

p <- DotPlot(guided_obj_sct, features = features, col.min = 0, 
             cols = c("light gray", "dark red")) + 
  theme_linedraw() + 
  theme(
    legend.direction = "horizontal",
    legend.position = "bottom",
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 9),
    axis.title.y = element_blank(), 
    axis.title.x = element_blank(), 
    axis.text.x = element_text(angle = 60, vjust = 0, hjust = 0),
    axis.text.y = element_text(colour = umap_colors, face = "bold")
  ) +
  scale_x_discrete(position = "top")

print(p)

# Subset guided_obj_sct for specific individuals
NA19114_obj_sct <- subset(guided_obj_sct, 
                          subset = vireo.individual == "NA19114")

NA19130_obj_sct <- subset(guided_obj_sct, 
                          subset = vireo.individual == "NA19130")

NA19152_obj_sct <- subset(guided_obj_sct, 
                          subset = vireo.individual == "NA19152")
# UMAP plot for cell line NA19114
NA19114_umap <- DimPlot(NA19114_obj_sct, reduction = "umap", 
                        cols = umap_colors) + NoLegend() +
  theme(
    axis.title.y = element_text(size = 8),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.x = element_text(size = 8),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.background = element_blank()
  ) +
  xlab("UMAP 1") + ylab("UMAP 2")

# Bar plot for cell type percentages in NA19114
NA19114_totals <- as.data.frame(table(NA19114_obj_sct@active.ident)) %>%
  mutate(Percent = scales::label_percent(accuracy = .1)(Freq / sum(Freq))) %>%
  rename(Type = Var1, Count = Freq) %>%
  mutate(Type = factor(Type, levels = c(
    "Hepatic endoderm",
    "Foregut endoderm",
    "Pluripotent stem cells",
    "Neuroectoderm",
    "Endocardial cells",
    "Second heart field",
    "Cardiomyocytes",
    "First heart field",
    "Epicardial cells",
    "Fibroblasts"
  )))

NA19114_bar <- ggplot(NA19114_totals, aes(x = Type, y = Count, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip(clip = "off") +
  geom_text(aes(label = Percent), size = 2, hjust = -0.01) +
  scale_fill_manual(values = bar_colors) +
  NoLegend() +
  theme(
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    panel.background = element_blank()
  )

# Combine UMAP and bar plot
p <- NA19114_umap + NA19114_bar + plot_layout(design = umap_layout)

print(p)

# UMAP plot for cell line NA19130
NA19130_umap <- DimPlot(NA19130_obj_sct, reduction = "umap", 
                        cols = umap_colors) + NoLegend() +
  theme(
    axis.title.y = element_text(size = 8),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.x = element_text(size = 8),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.background = element_blank()
  ) +
  xlab("UMAP 1") + ylab("UMAP 2")

# Bar plot for cell type percentages in NA19130
NA19130_totals <- as.data.frame(table(NA19130_obj_sct@active.ident)) %>%
  mutate(Percent = scales::label_percent(accuracy = .1)(Freq / sum(Freq))) %>%
  rename(Type = Var1, Count = Freq) %>%
  mutate(Type = factor(Type, levels = c(
    "Hepatic endoderm",
    "Foregut endoderm",
    "Pluripotent stem cells",
    "Neuroectoderm",
    "Endocardial cells",
    "Second heart field",
    "Cardiomyocytes",
    "First heart field",
    "Epicardial cells",
    "Fibroblasts"
  )))

NA19130_bar <- ggplot(NA19130_totals, aes(x = Type, y = Count, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip(clip = "off") +
  geom_text(aes(label = Percent), size = 2, hjust = -0.01) +
  scale_fill_manual(values = bar_colors) +
  NoLegend() +
  theme(
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    panel.background = element_blank()
  )

# Combine UMAP and bar plot
p <- NA19130_umap + NA19130_bar + plot_layout(design = umap_layout)

print(p)

# UMAP plot for cell line NA19152
NA19152_umap <- DimPlot(NA19152_obj_sct, reduction = "umap", 
                        cols = umap_colors) + NoLegend() +
  theme(
    axis.title.y = element_text(size = 8, color = "black"),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.x = element_text(size = 8, colour = "black"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.background = element_blank()
  ) +
  xlab("UMAP 1") + ylab("UMAP 2")

# Bar plot for cell type percentages in NA19152
NA19152_totals <- as.data.frame(table(NA19152_obj_sct@active.ident)) %>%
  mutate(Percent = scales::label_percent(accuracy = .1)(Freq / sum(Freq))) %>%
  rename(Type = Var1, Count = Freq) %>%
  mutate(Type = factor(Type, levels = c(
    "Hepatic endoderm",
    "Foregut endoderm",
    "Pluripotent stem cells",
    "Neuroectoderm",
    "Endocardial cells",
    "Second heart field",
    "Cardiomyocytes",
    "First heart field",
    "Epicardial cells",
    "Fibroblasts"
  )))

NA19152_bar <- ggplot(NA19152_totals, aes(x = Type, y = Count, fill = Type)) +
  geom_bar(stat = "identity") +
  coord_flip(clip = "off") +
  geom_text(aes(label = Percent), size = 2, hjust = -0.01) +
  scale_fill_manual(values = bar_colors) +
  NoLegend() +
  theme(
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_text(size = 9, color = "black"),
    axis.text.x = element_text(size = 8, colour = "black"),
    panel.background = element_blank()
  )

# Combine UMAP and bar plot
p <- NA19152_umap + NA19152_bar + plot_layout(design = umap_layout) 
print(p)

# Subset foregut populations for supplemental materials
gut <- subset(guided_obj_sct, subset = seurat_clusters == 4)

# Apply SCTransform, PCA, UMAP, FindNeighbors, and FindClusters
gut <- SCTransform(gut, variable.features.n = 5000, verbose = FALSE)
gut <- RunPCA(gut, npcs = 50, verbose = FALSE)
gut <- RunUMAP(gut, dims = 1:50, verbose = FALSE)
gut <- FindNeighbors(gut, dims = 1:50, verbose = FALSE)
gut <- FindClusters(gut, resolution = 0.1, verbose = FALSE)

# Assign cluster names
gut_cluster_ids <- c("Anterior foregut", "Posterior foregut")    
names(gut_cluster_ids) <- levels(gut)
gut <- RenameIdents(gut, gut_cluster_ids)

# UMAP plot for gut data
p <- DimPlot(gut, reduction = "umap", cols = c("#0077BB", "#EE7733")) +
  theme(
    axis.title.y = element_text(size = 8, color = "black"),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.x = element_text(size = 8, colour = "black"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.background = element_blank()
  ) +
  xlab("UMAP 1") + ylab("UMAP 2")  + coord_fixed(ratio = 1)

print(p)

# Violin plots for marker genes for anterior and posterior foregut
SOX2 <- VlnPlot(gut, "SOX2", cols = c("#0077BB", "#EE7733")) + NoLegend() +
  theme(axis.title.x = element_blank(), axis.text.x = element_text(size = 10), 
        axis.title.y = element_text(size = 12))

ISL1 <- VlnPlot(gut, "ISL1", cols = c("#0077BB", "#EE7733")) + NoLegend() +
  theme(axis.title.x = element_blank(), axis.text.x = element_text(size = 10), 
        axis.title.y = element_blank())

IRX3 <- VlnPlot(gut, "IRX3", cols = c("#0077BB", "#EE7733")) + NoLegend() +
  theme(axis.title.x = element_blank(), axis.text.x = element_text(size = 10), 
        axis.title.y = element_blank())

HNF4A <- VlnPlot(gut, "HNF4A", cols = c("#0077BB", "#EE7733")) + NoLegend() +
  theme(axis.title.x = element_blank(), axis.text.x = element_text(size = 10), 
        axis.title.y = element_text(size = 12))

PROX1 <- VlnPlot(gut, "PROX1", cols = c("#0077BB", "#EE7733")) + NoLegend() +
  theme(axis.title.x = element_blank(), axis.text.x = element_text(size = 10), 
        axis.title.y = element_blank())

FGB <- VlnPlot(gut, "FGB", cols = c("#0077BB", "#EE7733")) + NoLegend() +
  theme(axis.title.x = element_blank(), axis.text.x = element_text(size = 10), 
        axis.title.y = element_blank())

# Layout setup for multiple plots
gut_layout <- c(area(1, 1), area(1, 2), area(1, 3))
# Combine violin plots of anterior foregut markers into one plot
p <- SOX2 + ISL1 + IRX3 + plot_layout(design = gut_layout)

print(p)

# Combine violin plots of posterior foregut markers into one plot
p <- HNF4A + PROX1 + FGB + plot_layout(design = gut_layout)

print(p)

FIGURE 2 PLOTTING OF GUIDED DIFFERENTIATION AND 16-DAY TIME-COURSE INTEGRATION

# Batch processing job due to high computational requirement
# Not run as part of this Rmarkdown

# Make the guided differentiation object
# Read in 10x data
guided_raw_data <- Read10X("guided_differentiation_2021/220510_A00639_1151_BH3YKKDRX2-YG-EM-17S-lns12/YG-EM-ln1-EM314_human/outs/filtered_feature_bc_matrix/")

# Create Seurat object
guided_obj <- CreateSeuratObject(counts = guided_raw_data, min.cells = 1)

# Add Vireo information (demultiplex to identify individuals)
add_vireo <- function(guided_obj, dir) {
  donor_ids <- read.table(paste0(dir, 'vireo/donor_ids.tsv'), 
                          header = TRUE, stringsAsFactors = FALSE)
  
  guided_obj <- AddMetaData(guided_obj, donor_ids$donor_id,   
                            col.name ='vireo.individual')
  guided_obj <- AddMetaData(guided_obj, donor_ids$prob_max,   
                            col.name ='vireo.prob.singlet')
  
  ambient.file <- paste0(dir, 'vireo/prop_ambient.tsv')
  if (file.exists(ambient.file)) {
    prop <- read.table(paste0(dir, 'vireo/prop_ambient.tsv'), 
                       header = TRUE, stringsAsFactors = FALSE)
    
    inds <- unique(donor_ids$best_singlet)
    donor_ids$prop_donor <- 0
    for (i in 1:length(inds)) {
      w <- which(donor_ids$best_singlet==inds[i])
      donor_ids$prop_donor[w] <- prop[w,inds[i]]
    }
    guided_obj <- AddMetaData(guided_obj, donor_ids$prop_donor, 
                              col.name ='vireo.prop.donor')
  }
  guided_obj
}

guided_obj <- add_vireo(guided_obj, dir)

individual.colname <- ('vireo.individual')
individual.colname <- individual.colname[which(individual.colname %in% 
                                                 colnames(guided_obj@meta.data))][1]

# Filter out doublets, unassigned cells, and cells with fewer than 1500 features
guided_obj <- subset(guided_obj, 
                     subset = vireo.individual != "doublet" & 
                       vireo.individual != "unassigned" & 
                       nFeature_RNA >= 1500)

# Make the time-course object
# Create list of directories containing DropSeq data
round1_dirs <- list.dirs(path = "dropseqRunner_round1/output", 
                         full.names = TRUE, 
                         recursive = TRUE)
round2_dirs <- list.dirs(path = "dropseqRunner_round2/output", 
                         full.names = TRUE, 
                         recursive = TRUE)
round3_dirs <- list.dirs(path = "dropseqRunner_round3/output", 
                         full.names = TRUE, 
                         recursive = TRUE)

# Combine directory lists
combined_dirs <- c(round1_dirs, round2_dirs, round3_dirs)
combined_dirs <- grep("*/Gene/raw", combined_dirs, value = TRUE)
combined_dirs <- as.data.frame(combined_dirs)

# Read in 10x data
rows <- nrow(combined_dirs)
mix_list <- list()
obj_list <- list()

for (i in 1:rows) {
  path <- as.character(combined_dirs[i, ])
  timecourse_raw_data <- Read10X(path)
  
  mixture <- str_match(path, "output/\\s*(.*?)\\s*_0_Solo")[, 2]
  mix_list[[i]] <- mixture
  
  timecourse_obj <- CreateSeuratObject(counts = timecourse_raw_data, 
                                       project = mixture, 
                                       assay = "RNA")
  obj_list[[i]] <- timecourse_obj
}

# Merge individual time-course objects
timecourse_obj <- merge(obj_list[[1]], 
                        y = obj_list[-1], 
                        add.cell.ids = mix_list)

# Only include genes expressed in ≥10 cells and cells expressing ≥300 genes
timecourse_obj <- CreateSeuratObject(counts = timecourse_obj@assays$RNA@counts, 
                                     min.cells = 10, 
                                     min.features = 300)

# Calculate the percentage of mitochondrial DNA and add as metadata
timecourse_obj[["percent.mt"]] <- PercentageFeatureSet(timecourse_obj, 
                                                       pattern = "^MT-")

#  Subset to exclude cells with more than 25% mitochondrial DNA content
timecourse_obj <- subset(timecourse_obj, subset = percent.mt <= 25)

# Prepare demuxlet files for integration
# List all demuxlet .best files
demux_files <- list.files(path="demux", 
                          pattern = "*_demux.best", 
                          full.names=TRUE)

# Initialize list for demuxlet dataframes
demux_list <- list()

# Loop to process each demuxlet file
for (i in 1:length(demux_files)) {
  # Read each demuxlet file
  demux_data <- fread(demux_files[[i]])

  # Extract mixture name from file path
  demux_name <- str_match(demux_files[[i]], "demux/\\s*(.*?)\\s*_demux")[,2]

  # Add mixture name as new column and prefix it to barcodes
  demux_data$orig.ident <- demux_name
  demux_data$BARCODE <- paste0(demux_name, "_", demux_data$BARCODE)

  # Add processed dataframe to the list
  demux_list[[i]] <- demux_data
}

# Merge all demuxlet dataframes into a single dataframe
demux_combined <- bind_rows(demux_list)

# Exclude cells with doublet probability greater than 0.3
demux_combined <- filter(demux_combined, PRB.DBL <= 0.3)
# Remove ambiguous cells
demux_combined <- filter(demux_combined, !grepl('AMB', BEST))

# Prepare collection data
# Read collection day dataframe
collect_data <- read.table(file = 'AllroundsCollectionInfo.tsv', 
                           sep = '\t', 
                           header = TRUE)

# Add a column for the full mixture ID name
collect_data <- mutate(collect_data, 
                       Mixture = paste0("E", collect_data$Exp, 
                                        "CD", collect_data$CD, 
                                        "col", collect_data$col))

# Subset to include relevant columns
collect_data <- subset(collect_data, select = c("Line", "Day", "Mixture"))

# Add collection day to demuxlet data
# Merge collection data with demuxlet data based on mixture and cell line
sample_info <- merge(demux_combined, collect_data, 
                     by.x=c('orig.ident', 'SNG.1ST'), 
                     by.y=c('Mixture', 'Line'), 
                     all = TRUE)

# Rename Demuxlet best singlet guess column to individual/cell line
sample_info <- sample_info %>% rename(Individual = SNG.1ST)

# Add sample information as metadata to object
# Set barcode column as row names for sample info
sample_coded <- column_to_rownames(sample_info, var = "BARCODE")

# Add sample metadata to time-course object
timecourse_obj <- AddMetaData(object = timecourse_obj, metadata = sample_coded)

# Remove cells not designated as singlets
timecourse_obj <- subset(timecourse_obj, subset = Individual != "NA")

# Filter cells based on feature and read counts
# Calculate upper and lower bounds for counts and features
count_max <- round(median(timecourse_obj$nCount_RNA) + 
                     4 * sd(timecourse_obj$nCount_RNA), digits = -2)

count_min <- max(0, round(median(timecourse_obj$nCount_RNA) - 
                            4 * sd(timecourse_obj$nCount_RNA), digits = -2))

feat_max <- round(median(timecourse_obj$nFeature_RNA) + 
                    4 * sd(timecourse_obj$nFeature_RNA), digits = -2)

feat_min <- max(0, round(median(timecourse_obj$nFeature_RNA) - 
                           4 * sd(timecourse_obj$nFeature_RNA), digits = -2))

# Filter cells that have features or reads above or below 4 s.d. from the median
timecourse_obj <- subset(timecourse_obj, 
                         subset = nFeature_RNA > feat_min & 
                           nFeature_RNA < feat_max & 
                           nCount_RNA < count_max & 
                           nCount_RNA > count_min)

# Combine and format objects
# Add origin metadata to each object
timecourse_obj <- AddMetaData(timecourse_obj, 
                              metadata="Time-course", 
                              col.name="Origin")

guided_obj <- AddMetaData(guided_obj, 
                          metadata="Guided differentiation", 
                          col.name="Origin")

# Merge objects and preface cell names with origin identifier
combined_obj <- merge(timecourse_obj, 
                      y = guided_obj, 
                      add.cell.ids = c("TC", "EB"), project = "compare")

# Replace missing 'Day' metadata in guided differentiation with 'EB'
combined_obj@meta.data$Day <- replace_na(combined_obj@meta.data$Day, 'EB')

# Process for SCTransform and integration
combined_obj <- SplitObject(combined_obj, split.by = "Origin")
combined_obj <- lapply(combined_obj, SCTransform, variable.features.n = 5000)
features <- SelectIntegrationFeatures(object.list = combined_obj, 
                                      nfeatures = 5000)
combined_obj <- PrepSCTIntegration(object.list = combined_obj, 
                                   anchor.features = features)
anchors <- FindIntegrationAnchors(object.list = combined_obj, 
                                  normalization.method = "SCT", 
                                  anchor.features = features)
combined_sct <- IntegrateData(anchorset = anchors, 
                              normalization.method = "SCT")
DefaultAssay(combined_sct) <- "integrated"
combined_sct <- RunPCA(combined_sct, npcs = 50)
combined_sct <- RunUMAP(combined_sct, dims = 1:50)
combined_sct <- FindNeighbors(combined_sct, dims = 1:50)

# Reorder day levels chronologically in combined_sct
combined_sct$Day <- factor(combined_sct$Day, 
                           levels = c('Day 0', 
                                      'Day 1', 
                                      'Day 3', 
                                      'Day 5', 
                                      'Day 7', 
                                      'Day 11', 
                                      'Day 15', 
                                      'EB'))

# Find clusters and prepare for differential expression analysis
int_obj <- FindClusters(combined_sct, resolution = 0.1)
int_obj <- PrepSCTFindMarkers(int_obj)
# Define a custom palette
my_seven_colors <- c("#117733",  # Green
                     "#882255",  # Wine
                     "#CC6677",  # Rose
                     "#999933",  # Olive
                     "#44AA99",  # Teal
                     "#332288",  # Indigo
                     "#AA4499")  # Purple

# Plot UMAP of the integrated object and split by origin
DimPlot(int_obj, group.by = 'Origin', cols = c("orange", "light blue"), 
        order = FALSE, raster = FALSE) + 
  theme(
    axis.title.y = element_text(size = 8, color = "black"),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.x = element_text(size = 8, colour = "black"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.background = element_blank()
  ) +
  xlab("UMAP 1") + ylab("UMAP 2") + NoLegend()

cells0_0 <- Cells(subset(x = int_obj, subset = Day == "Day 0" & seurat_clusters == 0))
cells0_1 <- Cells(subset(x = int_obj, subset = Day == "Day 0" & seurat_clusters == 1))
cells0_2 <- Cells(subset(x = int_obj, subset = Day == "Day 0" & seurat_clusters == 2))
cells0_3 <- Cells(subset(x = int_obj, subset = Day == "Day 0" & seurat_clusters == 3))
cells0_4 <- Cells(subset(x = int_obj, subset = Day == "Day 0" & seurat_clusters == 4))
cells0_5 <- Cells(subset(x = int_obj, subset = Day == "Day 0" & seurat_clusters == 5))
#cells0_6 <- Cells(subset(x = int_obj, subset = Day == "Day 0" & seurat_clusters == 6)) # no cells

day0 <- DimPlot(object = int_obj, cells.highlight = list(cells0_0, cells0_1, cells0_2, cells0_3, cells0_4, cells0_5), cols.highlight = c("#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Time-course (day 0)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(day0)

# Highlight time-course day 1 clusters
cells1_0 <- Cells(subset(x = int_obj, subset = Day == "Day 1" & seurat_clusters == 0))
cells1_1 <- Cells(subset(x = int_obj, subset = Day == "Day 1" & seurat_clusters == 1))
cells1_2 <- Cells(subset(x = int_obj, subset = Day == "Day 1" & seurat_clusters == 2))
cells1_3 <- Cells(subset(x = int_obj, subset = Day == "Day 1" & seurat_clusters == 3))
cells1_4 <- Cells(subset(x = int_obj, subset = Day == "Day 1" & seurat_clusters == 4))
cells1_5 <- Cells(subset(x = int_obj, subset = Day == "Day 1" & seurat_clusters == 5))
#cells1_6 <- Cells(subset(x = int_obj, subset = Day == "Day 1" & seurat_clusters == 6)) # no cells

day1 <- DimPlot(object = int_obj, cells.highlight = list(cells1_0, cells1_1, cells1_2, cells1_3, cells1_4, cells1_5), cols.highlight = c("#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Time-course (day 1)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(day1)

# Highlight time-course day 3 clusters
cells3_0 <- Cells(subset(x = int_obj, subset = Day == "Day 3" & seurat_clusters == 0))
cells3_1 <- Cells(subset(x = int_obj, subset = Day == "Day 3" & seurat_clusters == 1))
cells3_2 <- Cells(subset(x = int_obj, subset = Day == "Day 3" & seurat_clusters == 2))
cells3_3 <- Cells(subset(x = int_obj, subset = Day == "Day 3" & seurat_clusters == 3))
cells3_4 <- Cells(subset(x = int_obj, subset = Day == "Day 3" & seurat_clusters == 4))
cells3_5 <- Cells(subset(x = int_obj, subset = Day == "Day 3" & seurat_clusters == 5))
#cells3_6 <- Cells(subset(x = int_obj, subset = Day == "Day 3" & seurat_clusters == 6)) # no cells

day3 <- DimPlot(object = int_obj, cells.highlight = list(cells3_0, cells3_1, cells3_2, cells3_3, cells3_4, cells3_5), cols.highlight = c("#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Time-course (day 3)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(day3)

# Highlight time-course day 5 clusters
cells5_0 <- Cells(subset(x = int_obj, subset = Day == "Day 5" & seurat_clusters == 0))
cells5_1 <- Cells(subset(x = int_obj, subset = Day == "Day 5" & seurat_clusters == 1))
cells5_2 <- Cells(subset(x = int_obj, subset = Day == "Day 5" & seurat_clusters == 2))
cells5_3 <- Cells(subset(x = int_obj, subset = Day == "Day 5" & seurat_clusters == 3))
cells5_4 <- Cells(subset(x = int_obj, subset = Day == "Day 5" & seurat_clusters == 4))
cells5_5 <- Cells(subset(x = int_obj, subset = Day == "Day 5" & seurat_clusters == 5))
cells5_6 <- Cells(subset(x = int_obj, subset = Day == "Day 5" & seurat_clusters == 6))

day5 <- DimPlot(object = int_obj, cells.highlight = list(cells5_0, cells5_1, cells5_2, cells5_3, cells5_4, cells5_5, cells5_6), cols.highlight = c("#AA4499", "#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Time-course (day 5)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(day5)

# Highlight time-course day 7 clusters
cells7_0 <- Cells(subset(x = int_obj, subset = Day == "Day 7" & seurat_clusters == 0))
cells7_1 <- Cells(subset(x = int_obj, subset = Day == "Day 7" & seurat_clusters == 1))
cells7_2 <- Cells(subset(x = int_obj, subset = Day == "Day 7" & seurat_clusters == 2))
cells7_3 <- Cells(subset(x = int_obj, subset = Day == "Day 7" & seurat_clusters == 3))
cells7_4 <- Cells(subset(x = int_obj, subset = Day == "Day 7" & seurat_clusters == 4))
cells7_5 <- Cells(subset(x = int_obj, subset = Day == "Day 7" & seurat_clusters == 5))
cells7_6 <- Cells(subset(x = int_obj, subset = Day == "Day 7" & seurat_clusters == 6))

day7 <- DimPlot(object = int_obj, cells.highlight = list(cells7_0, cells7_1, cells7_2, cells7_3, cells7_4, cells7_5, cells7_6), cols.highlight = c("#AA4499", "#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Time-course (day 7)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(day7)

# Highlight time-course day 11 clusters
cells11_0 <- Cells(subset(x = int_obj, subset = Day == "Day 11" & seurat_clusters == 0))
cells11_1 <- Cells(subset(x = int_obj, subset = Day == "Day 11" & seurat_clusters == 1))
cells11_2 <- Cells(subset(x = int_obj, subset = Day == "Day 11" & seurat_clusters == 2))
cells11_3 <- Cells(subset(x = int_obj, subset = Day == "Day 11" & seurat_clusters == 3))
cells11_4 <- Cells(subset(x = int_obj, subset = Day == "Day 11" & seurat_clusters == 4))
cells11_5 <- Cells(subset(x = int_obj, subset = Day == "Day 11" & seurat_clusters == 5))
cells11_6 <- Cells(subset(x = int_obj, subset = Day == "Day 11" & seurat_clusters == 6))

day11 <- DimPlot(object = int_obj, cells.highlight = list(cells11_0, cells11_1, cells11_2, cells11_3, cells11_4, cells11_5, cells11_6), cols.highlight = c("#AA4499", "#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Time-course (day 11)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(day11)

# Highlight time-course day 15 clusters
cells15_0 <- Cells(subset(x = int_obj, subset = Day == "Day 15" & seurat_clusters == 0))
cells15_1 <- Cells(subset(x = int_obj, subset = Day == "Day 15" & seurat_clusters == 1))
cells15_2 <- Cells(subset(x = int_obj, subset = Day == "Day 15" & seurat_clusters == 2))
cells15_3 <- Cells(subset(x = int_obj, subset = Day == "Day 15" & seurat_clusters == 3))
cells15_4 <- Cells(subset(x = int_obj, subset = Day == "Day 15" & seurat_clusters == 4))
cells15_5 <- Cells(subset(x = int_obj, subset = Day == "Day 15" & seurat_clusters == 5))
cells15_6 <- Cells(subset(x = int_obj, subset = Day == "Day 15" & seurat_clusters == 6))

day15 <- DimPlot(object = int_obj, cells.highlight = list(cells15_0, cells15_1, cells15_2, cells15_3, cells15_4, cells15_5, cells15_6), cols.highlight = c("#AA4499", "#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Time-course (day 15)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(day15)

# Highlight guided differentiation ('EB') clusters
cellsEB_0 <- Cells(subset(x = int_obj, subset = Day == "EB" & seurat_clusters == 0))
cellsEB_1 <- Cells(subset(x = int_obj, subset = Day == "EB" & seurat_clusters == 1))
cellsEB_2 <- Cells(subset(x = int_obj, subset = Day == "EB" & seurat_clusters == 2))
cellsEB_3 <- Cells(subset(x = int_obj, subset = Day == "EB" & seurat_clusters == 3))
cellsEB_4 <- Cells(subset(x = int_obj, subset = Day == "EB" & seurat_clusters == 4))
cellsEB_5 <- Cells(subset(x = int_obj, subset = Day == "EB" & seurat_clusters == 5))
cellsEB_6 <- Cells(subset(x = int_obj, subset = Day == "EB" & seurat_clusters == 6))

EB <- DimPlot(object = int_obj, cells.highlight = list(cellsEB_0, cellsEB_1, cellsEB_2, cellsEB_3, cellsEB_4, cellsEB_5, cellsEB_6), cols.highlight = c("#AA4499", "#332288", "#44AA99", "#999933", "#CC6677", "#882255", "#117733"), cols = "#DDDDDD", order = TRUE, label = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) +
  ggtitle("Guided differentiation (single collection)") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 11),
    panel.border = element_rect_round(radius = unit(0.1, "snpc"), color = "black", fill = NA)
  ) + NoLegend() + NoAxes()
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(EB)

# Setup layout for printing multiple plots
p1 <- ggplot(mtcars) + geom_point(aes(mpg, disp))
p2 <- ggplot(mtcars) + geom_boxplot(aes(gear, disp, group = gear))
p3 <- ggplot(mtcars) + geom_bar(aes(gear)) + facet_wrap(~cyl)

# Define the layout for UMAP plots
umap_layout <- c(area(1, 1), area(1, 2), area(1, 3), area(2, 1), area(2, 2), area(2, 3), area(3, 1), area(3, 2, 4, 3))

# Plot each time-course day and the guided differentiation
p <- day0 + day1 + day3 + day5 + day7 + day11 + day15 + EB + plot_layout(design = umap_layout)

print(p)

# Plot UMAP for the entire integrated dataset
p <- DimPlot(int_obj, reduction = "umap", cols = my_seven_colors, order = FALSE, raster = TRUE, raster.dpi = c(1024, 1024)) + 
  theme(
    axis.title.y = element_text(size = 8, color = "black"),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.x = element_text(size = 8, colour = "black"),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.background = element_blank()
  ) +
  xlab("UMAP 1") + ylab("UMAP 2")
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
print(p)

# While UMAP embedding and clustering are done with the integrated assay, the corrected values are no longer reliable as the quantitative measure of gene expression
# For performing differential expression after integration, switch back to the original data
DefaultAssay(int_obj) <- "SCT"

# Plot marker gene expression by cluster, split by origin
features <- c("KDR", 
              "APOA1", 
              "FOXA2", 
              "CRABP2", 
              "POU5F1", 
              "CCND1", 
              "LUM",
              "COL3A1", 
              "TNNT2", 
              "TTN")

p <- DotPlot(int_obj, features = features, cols = c("#0077BB", "#EE7733"), 
             split.by = "Origin", dot.scale = 8) +
  coord_flip() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  )

print(p)

FIGURE 3 PLOTTING OF SCPRED / AUTOMATED CELL ANNOTATION

# Read in reference data from Miao 2020
miao_meta_data <- read.table(file = "integration_analyses/Miao_2020/metadata.txt", 
                             sep = '\t', header = TRUE)
miao_raw_data <- read.table(file = "integration_analyses/Miao_2020/expression.txt", 
                            sep = '\t', header = TRUE)

# Remove first row of metadata (dataclass of each column)
miao_meta_data <- miao_meta_data[-1,]

# Set first column as row names
miao_meta_fix <- miao_meta_data[,-1]
rownames(miao_meta_fix) <- miao_meta_data[,1]
miao_raw_fix <- miao_raw_data[,-1]
rownames(miao_raw_fix) <- miao_raw_data[,1]

# Create Seurat object
miao_reference <- CreateSeuratObject(counts = miao_raw_fix, 
                                     meta.data = miao_meta_fix, 
                                     min.cells = 1)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# Correct spelling in metadata
miao_reference@meta.data$Cell_type <- gsub("Epicarduim", "Epicardium", 
                                           miao_reference@meta.data$Cell_type)
# Add origin info to reference object metadata
miao_reference <- AddMetaData(miao_reference, 
                              metadata="Miao_fetal_heart", 
                              col.name="Origin")

# Apply SCTransform, PCA, UMAP, and FindNeighbors
miao_reference <- SCTransform(miao_reference, 
                              variable.features.n = 5000, 
                              verbose = FALSE)
miao_reference <- RunPCA(miao_reference, npcs = 50, verbose = FALSE) 
miao_reference <- RunUMAP(miao_reference, dims = 1:50, verbose = FALSE, 
                          return.model = TRUE)
miao_reference <- FindNeighbors(miao_reference, dims = 1:50, verbose = FALSE)

# Train classifiers with scPred
miao_reference <- getFeatureSpace(miao_reference, "Cell_type")
## ●  Extracting feature space for each cell type...
## DONE!
miao_reference <- trainModel(miao_reference)
## ●  Training models for each cell type...
## DONE!
# Read in guided differentiation culture
guided_obj <- CreateSeuratObject(Read10X("guided_differentiation_2021/220510_A00639_1151_BH3YKKDRX2-YG-EM-17S-lns12/YG-EM-ln1-EM314_human/outs/filtered_feature_bc_matrix/"), min.cells = 1)

# Read in day 100 multi-lineage organoid from Silva 2021
silva_obj <- CreateSeuratObject(Read10X("integration_analyses/Silva_2021/filtered_feature_bc_matrix/"), min.cells = 1)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# Read in PBMCs (negative control)
pbmc_data <- CreateSeuratObject(Read10X("integration_analyses/20k_pbmc/filtered_feature_bc_matrix/"), min.cells = 1)
# Create Seurat object from raw data
guided_obj <- CreateSeuratObject(guided_raw_data, min.cells = 1)

# Function to add Vireo metadata
add_vireo_metadata <- function(guided_obj, dir) {
  donor_ids_path <- paste0(dir, 'vireo/donor_ids.tsv')
  ambient_file_path <- paste0(dir, 'vireo/prop_ambient.tsv')
  
  donor_ids <- read.table(donor_ids_path,
                          header = TRUE, 
                          stringsAsFactors = FALSE)
  
  guided_obj <- AddMetaData(guided_obj, 
                            donor_ids$donor_id, 'vireo.individual')
  
  guided_obj <- AddMetaData(guided_obj, 
                            donor_ids$prob_max, 'vireo.prob.singlet')
  
  if (file.exists(ambient_file_path)) {
    prop <- read.table(ambient_file_path, header = TRUE, 
                       stringsAsFactors = FALSE)
    inds <- unique(donor_ids$best_singlet)
    donor_ids$prop_donor <- rep(0, nrow(donor_ids))
    for (i in inds) {
      donor_ids$prop_donor[donor_ids$best_singlet == i] <- prop[inds == i]
    }
    guided_obj <- AddMetaData(guided_obj, 
                              donor_ids$prop_donor, 'vireo.prop.donor')
  }
  guided_obj
}

# Apply function to Seurat object
guided_obj <- add_vireo_metadata(guided_obj, dir)

# Define individual column name
individual_colname <- 'vireo.individual'

# Filter guided obj for assigned singlets containing ≥1,500 features per cell
guided_obj <- subset(guided_obj, 
                     subset = vireo.individual != "doublet" & 
                       vireo.individual != "unassigned" & 
                       nFeature_RNA >= 1500)

# Apply SCTransform and add metadata to guided obj
guided_obj <- SCTransform(guided_obj, variable.features.n = 5000, 
                          verbose = FALSE)
guided_obj <- AddMetaData(guided_obj, 
                          metadata = "guided_obj", 
                          col.name = "Origin")
# Filter Silva obj based on # of features, apply SCTransform, add metadata
silva_obj <- subset(silva_obj, subset = nFeature_RNA >= 1500)
silva_obj <- SCTransform(silva_obj, variable.features.n = 5000, verbose = FALSE)
silva_obj <- AddMetaData(silva_obj, 
                         metadata="multi-lineage organoid", 
                         col.name="Origin")

# Apply SCTransform to pbmcs and add metadata
pbmc_data <- SCTransform(pbmc_data, variable.features.n = 5000, verbose = FALSE)
pbmc_data <- AddMetaData(pbmc_data, 
                         metadata="PBMCs", 
                         col.name="Origin")
# Set probability threshold for cell classification
threshold_value <- 0.9

# Classify query cells
guided_obj <- scPredict(guided_obj, miao_reference, threshold = threshold_value)
## ●  Matching reference with new dataset...
##   ─ 5000 features present in reference loadings
##   ─ 3206 features shared between reference and new dataset
##   ─ 64.12% of features in the reference are present in new dataset
## ●  Aligning new data to reference...
## Warning: HarmonyMatrix is deprecated and will be removed in the future from the
## API in the future
## Warning: Warning: The parameters do_pca and npcs are deprecated. They will be ignored for this function call and please remove parameters do_pca and npcs and pass to harmony cell_embeddings directly.
## This warning is displayed once per session.
## Warning: Warning: The parameter max.iter.harmony is replaced with parameter max_iter. It will be ignored for this function call and please use parameter max_iter in future function calls.
## This warning is displayed once per session.
## Transposing data matrix
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony converged after 5 iterations
## ●  Classifying cells...
## DONE!
silva_obj <- scPredict(silva_obj, miao_reference, threshold = threshold_value)
## ●  Matching reference with new dataset...
##   ─ 5000 features present in reference loadings
##   ─ 3371 features shared between reference and new dataset
##   ─ 67.42% of features in the reference are present in new dataset
## ●  Aligning new data to reference...
## Warning: HarmonyMatrix is deprecated and will be removed in the future from the
## API in the future
## Transposing data matrix
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony converged after 3 iterations
## ●  Classifying cells...
## DONE!
pbmc_data <- scPredict(pbmc_data, miao_reference, threshold = threshold_value)
## ●  Matching reference with new dataset...
##   ─ 5000 features present in reference loadings
##   ─ 2603 features shared between reference and new dataset
##   ─ 52.06% of features in the reference are present in new dataset
## ●  Aligning new data to reference...
## Warning: HarmonyMatrix is deprecated and will be removed in the future from the
## API in the future
## Transposing data matrix
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony converged after 2 iterations
## ●  Classifying cells...
## DONE!
# Subset classified cells and plot them (unclassified cells will not be plotted)
sub_guided_obj <- subset(guided_obj, 
                         subset = scpred_prediction == "unassigned", 
                         invert = TRUE)
sub_guided_obj <- RunUMAP(sub_guided_obj, 
                          reduction = "scpred", dims = 1:50, verbose = FALSE)

sub_silva_obj <- subset(silva_obj, 
                        subset = scpred_prediction == "unassigned", 
                        invert = TRUE)
sub_silva_obj <- RunUMAP(sub_silva_obj, 
                         reduction = "scpred", dims = 1:50, verbose = FALSE)

sub_pbmc_data <- subset(pbmc_data, 
                        subset = scpred_prediction == "unassigned", 
                        invert = TRUE)
sub_pbmc_data <- RunUMAP(sub_pbmc_data, 
                         reduction = "scpred", dims = 1:50, verbose = FALSE)
# Define color palette
colors <- brewer.pal(11, 'Paired')

# Plot UMAPs
# colors for UMAP plots
colors <- brewer.pal(n = 11, name = 'Paired')


# Plot UMAPs
ref_plot <- DimPlot(object = miao_reference, reduction = "umap", 
                    group.by = "Cell_type", label = TRUE, repel = TRUE, 
                    cols = c("#A6CEE3", 
                             "#1F78B4", 
                             "#B2DF8A", 
                             "#33A02C", 
                             "#FB9A99", 
                             "#E31A1C", 
                             "#FDBF6F", 
                             "#FF7F00", 
                             "#CAB2D6", 
                             "#6A3D9A", 
                             "#FFFF99")) + 
  theme(axis.title.y = element_text(size = 8, color = "black"),
        axis.ticks.y=element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size = 8, colour = "black"),
        axis.text.x = element_blank(),
        axis.ticks.x=element_blank(),
        panel.background = element_blank()) +
        xlab("UMAP 1") + ylab("UMAP 2") +
        ggtitle(NULL) +
        NoLegend()

guided_plot <- DimPlot(sub_guided_obj, 
                       group.by = "scpred_prediction", label = TRUE, repel = TRUE, 
                       cols = c("#A6CEE3", 
                                "#1F78B4", 
                                "#B2DF8A", 
                                "#33A02C", 
                                "#FB9A99", 
                                "#E31A1C", 
                                "#FDBF6F", 
                                "#FF7F00", 
                                "#CAB2D6", 
                                "#6A3D9A")) + 
  theme(axis.title.y = element_text(size = 8, color = "black"),
        axis.ticks.y=element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size = 8, colour = "black"),
        axis.text.x = element_blank(),
        axis.ticks.x=element_blank(),
        panel.background = element_blank()) +
        xlab("UMAP 1") + ylab("UMAP 2") +
        ggtitle(NULL) +
        NoLegend()

silva_plot <- DimPlot(sub_silva_obj, 
                      group.by = "scpred_prediction", label = TRUE, repel = TRUE, 
                      cols = c("#A6CEE3", 
                               "#1F78B4", 
                               "#33A02C", 
                               "#E31A1C", 
                               "#FDBF6F", 
                               "#CAB2D6", 
                               "#6A3D9A", 
                               "#FFFF99")) + 
  theme(axis.title.y = element_text(size = 8, color = "black"),
        axis.ticks.y=element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size = 8, colour = "black"),
        axis.text.x = element_blank(),
        axis.ticks.x=element_blank(),
        panel.background = element_blank()) +
        xlab("UMAP 1") + ylab("UMAP 2") +
        ggtitle(NULL) +
        NoLegend()
# Setup layout for multiple plots and print
umaps_layout <- c(area(1, 1), area(2, 1), area(2, 2))
p <- ref_plot + guided_plot + silva_plot + plot_layout(design = umaps_layout)

print(p)

# Plot PBMC UMAP for supplemental materials
p <- DimPlot(sub_pbmc_data, 
             group.by = "scpred_prediction", label = TRUE, repel = TRUE, 
             cols = c("#FDBF6F")) + 
  theme(axis.title.y = element_text(size = 8, color = "black"),
        axis.ticks.y=element_blank(),
        axis.text.y = element_blank(),
        axis.title.x = element_text(size = 8, colour = "black"),
        axis.text.x = element_blank(),
        axis.ticks.x=element_blank(),
        panel.background = element_blank()) +
        xlab("UMAP 1") + ylab("UMAP 2") +
        ggtitle(NULL)

print(p)

SOFTWARE INFORMATION

print(sessionInfo())
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /software/openblas-0.3.13-el7-x86_64/lib/libopenblas_haswellp-r0.3.13.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C         LC_TIME=C           
##  [4] LC_COLLATE=C         LC_MONETARY=C        LC_MESSAGES=C       
##  [7] LC_PAPER=C           LC_NAME=C            LC_ADDRESS=C        
## [10] LC_TELEPHONE=C       LC_MEASUREMENT=C     LC_IDENTIFICATION=C 
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] caret_6.0-92            lattice_0.20-45         scPred_1.9.2           
##  [4] Seurat_4.3.0            ggalluvial_0.12.5       RColorBrewer_1.1-3     
##  [7] patchwork_1.1.1         elementalist_0.0.0.9000 lubridate_1.9.3        
## [10] forcats_1.0.0           stringr_1.5.0           dplyr_1.1.3            
## [13] purrr_1.0.2             readr_2.1.4             tidyr_1.3.0            
## [16] tibble_3.2.1            tidyverse_2.0.0         ggrepel_0.9.4          
## [19] ggplot2_3.4.4           cowplot_1.1.1           data.table_1.14.2      
## [22] SeuratObject_4.1.3      sp_1.6-0                Matrix_1.5-3           
## 
## loaded via a namespace (and not attached):
##   [1] plyr_1.8.7             igraph_1.3.1           lazyeval_0.2.2        
##   [4] splines_4.2.0          listenv_0.8.0          scattermore_0.8       
##   [7] digest_0.6.29          foreach_1.5.2          htmltools_0.5.2       
##  [10] fansi_1.0.3            magrittr_2.0.3         tensor_1.5            
##  [13] cluster_2.1.3          ROCR_1.0-11            tzdb_0.4.0            
##  [16] recipes_0.2.0          globals_0.15.0         gower_1.0.0           
##  [19] matrixStats_0.62.0     R.utils_2.11.0         hardhat_0.2.0         
##  [22] timechange_0.2.0       spatstat.sparse_3.0-0  colorspace_2.0-3      
##  [25] xfun_0.30              crayon_1.5.1           jsonlite_1.8.7        
##  [28] progressr_0.10.0       spatstat.data_3.0-0    survival_3.3-1        
##  [31] zoo_1.8-10             iterators_1.0.14       glue_1.6.2            
##  [34] polyclip_1.10-0        gtable_0.3.0           ipred_0.9-12          
##  [37] leiden_0.4.2           kernlab_0.9-30         future.apply_1.9.0    
##  [40] abind_1.4-5            scales_1.2.0           DBI_1.1.3             
##  [43] spatstat.random_3.1-3  miniUI_0.1.1.1         Rcpp_1.0.8.3          
##  [46] viridisLite_0.4.0      xtable_1.8-4           reticulate_1.24       
##  [49] stats4_4.2.0           lava_1.6.10            prodlim_2019.11.13    
##  [52] htmlwidgets_1.5.4      httr_1.4.3             ellipsis_0.3.2        
##  [55] ica_1.0-2              farver_2.1.0           R.methodsS3_1.8.1     
##  [58] pkgconfig_2.0.3        nnet_7.3-17            sass_0.4.1            
##  [61] uwot_0.1.14            deldir_1.0-6           utf8_1.2.2            
##  [64] labeling_0.4.2         tidyselect_1.2.0       rlang_1.1.2           
##  [67] reshape2_1.4.4         later_1.3.0            munsell_0.5.0         
##  [70] tools_4.2.0            cli_3.6.1              generics_0.1.2        
##  [73] ggridges_0.5.3         evaluate_0.15          fastmap_1.1.0         
##  [76] yaml_2.3.5             goftest_1.2-3          RhpcBLASctl_0.23-42   
##  [79] ModelMetrics_1.2.2.2   knitr_1.39             fitdistrplus_1.1-8    
##  [82] RANN_2.6.1             pbapply_1.5-0          future_1.25.0         
##  [85] nlme_3.1-157           mime_0.12              R.oo_1.24.0           
##  [88] compiler_4.2.0         rstudioapi_0.15.0      beeswarm_0.4.0        
##  [91] plotly_4.10.0          png_0.1-7              spatstat.utils_3.0-1  
##  [94] bslib_0.3.1            stringi_1.7.6          highr_0.9             
##  [97] vctrs_0.6.4            pillar_1.9.0           lifecycle_1.0.4       
## [100] spatstat.geom_3.0-6    lmtest_0.9-40          jquerylib_0.1.4       
## [103] RcppAnnoy_0.0.19       irlba_2.3.5            httpuv_1.6.5          
## [106] R6_2.5.1               promises_1.2.0.1       KernSmooth_2.23-20    
## [109] gridExtra_2.3          vipor_0.4.5            parallelly_1.31.1     
## [112] codetools_0.2-18       MASS_7.3-56            withr_2.5.0           
## [115] sctransform_0.3.5      harmony_1.1.0          parallel_4.2.0        
## [118] hms_1.1.3              grid_4.2.0             rpart_4.1.16          
## [121] timeDate_3043.102      class_7.3-20           rmarkdown_2.14        
## [124] Rtsne_0.16             pROC_1.18.0            spatstat.explore_3.0-6
## [127] shiny_1.7.1            ggbeeswarm_0.7.2